import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Read the Excel file into a pandas DataFrame
df = pd.read_excel('/Users/Cench/Desktop/Cantonese_opera /Third_Stage/Coder_Reliability Test/Inter-coder relibility .xlsx')

# Define the variable names
variable_names = ['HD', 'AT', 'VT', 'VTT', 'VTST', 'ST', 'T', 'CL']

# Set the number of bootstrap iterations
n_bootstrap = 1000


def _kappa_with_p_value(ratings_a, ratings_b, n_iterations):
    """Return Cohen's Kappa for two coders and a resampling-based p-value.

    Args:
        ratings_a: pandas Series with the first coder's ratings.
        ratings_b: pandas Series with the second coder's ratings, sharing
            the row index of ``ratings_a``.
        n_iterations: Number of resampling iterations used for the p-value.

    Returns:
        A ``(kappa, p_value)`` tuple. Both are NaN when the two coders have
        no row that both of them rated.
    """
    # Drop missing values jointly so every remaining row holds the ratings
    # both coders gave to the same item. Dropping per column would shift the
    # ratings against each other and could leave the columns with different
    # lengths.
    paired = pd.concat([ratings_a, ratings_b], axis=1, keys=['a', 'b']).dropna()
    if paired.empty:
        return np.nan, np.nan

    # Convert ratings to integer format
    a = paired['a'].astype(int).to_numpy()
    b = paired['b'].astype(int).to_numpy()

    observed = cohen_kappa_score(a, b)

    # Each coder's ratings are resampled independently with replacement, which
    # breaks the item-level pairing; the simulated kappas therefore describe
    # the agreement obtained when the two rating sets are unrelated.
    n_items = len(a)
    simulated = np.array([
        cohen_kappa_score(
            np.random.choice(a, size=n_items, replace=True),
            np.random.choice(b, size=n_items, replace=True),
        )
        for _ in range(n_iterations)
    ])

    # Share of simulated kappas at least as extreme as the observed one
    p_value = (np.abs(simulated) >= np.abs(observed)).mean()
    return observed, p_value


# Column layout: one column per variable for coder C, followed by the same
# variables for coder F, then for coder H.
n_variables = len(variable_names)

# Iterate over each variable and calculate the Kappa coefficient
for i, variable_name in enumerate(variable_names):
    coder_c_ratings = df.iloc[:, i]  # Ratings for coder C
    coder_f_ratings = df.iloc[:, i + n_variables]  # Ratings for coder F
    coder_h_ratings = df.iloc[:, i + 2 * n_variables]  # Ratings for coder H

    kappa_c_f_observed, p_value_c_f = _kappa_with_p_value(coder_c_ratings, coder_f_ratings, n_bootstrap)
    kappa_c_h_observed, p_value_c_h = _kappa_with_p_value(coder_c_ratings, coder_h_ratings, n_bootstrap)
    kappa_f_h_observed, p_value_f_h = _kappa_with_p_value(coder_f_ratings, coder_h_ratings, n_bootstrap)

    print(f"{variable_name}:")
    print("Cohen's Kappa between C and F: {:.4f} (p-value: {:.4f})".format(kappa_c_f_observed, p_value_c_f))
    print("Cohen's Kappa between C and H: {:.4f} (p-value: {:.4f})".format(kappa_c_h_observed, p_value_c_h))
    print("Cohen's Kappa between F and H: {:.4f} (p-value: {:.4f})".format(kappa_f_h_observed, p_value_f_h))
    print()